Jee's Facebook Chat Analysis

Goal: Data mine my Facebook chat history.


In [1]:
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')

# Path of the Facebook chat history file, downloaded from Facebook.com via its
# personal data archival functionality. parse_html() opens this file and parses
# it as HTML.
FILENAME = 'messages-2016-02.htm'

In [2]:
# Utility functions
import lxml
import itertools
import dateparser
import csv
import pandas
import tqdm
import numpy
import datetime
from lxml import etree

def parse_html():
    """Parse the chat export named by ``FILENAME`` with lxml's HTML parser.

    Returns:
        The tree object returned by ``etree.parse``.
    """
    parser = etree.HTMLParser()
    # The context manager closes the file handle even when parsing raises.
    with open(FILENAME, 'rb') as f:
        return etree.parse(f, parser)

def get_threads(node):
    """Return the result of querying ``node`` for ``div`` elements of class ``thread``.

    Args:
        node: Any object exposing an ``xpath(expression)`` method.

    Returns:
        Whatever ``node.xpath`` yields for the thread query.
    """
    thread_query = "//div[@class='thread']"
    matches = node.xpath(thread_query)
    return matches
    
def get_messages(thread):
    """Extract the title and every parsed message from one thread element.

    Args:
        thread: An element exposing ``xpath`` (an item returned by ``get_threads``).

    Returns:
        dict with two keys: ``'title'`` (the first direct text node of the
        thread) and ``'messages'`` (a list of the dicts produced by
        ``get_single_message``, in document order).

    Raises:
        IndexError: if the thread has no direct text node to use as title.
        AssertionError: if the number of ``message`` elements differs from the
            number of ``<p>`` elements.
    """
    title = thread.xpath("./text()")[0]
    messages = thread.xpath(".//*[@class='message']")
    ps = thread.xpath(".//p")
    # Message headers and <p> bodies are paired by position, so their counts
    # have to match.
    assert len(messages) == len(ps)
    # Built-in zip is available on both Python 2 and Python 3, whereas
    # itertools.izip only exists on Python 2.
    parsed_messages = [
        get_single_message(message, p) for message, p in zip(messages, ps)
    ]
    return {
        'title': title,
        'messages': parsed_messages,
    }
    
def get_single_message(message, p):
    """Build the record for a single chat message.

    Args:
        message: Element holding the ``user`` and ``meta`` sub-elements.
        p: Element whose ``text`` attribute is the message body.

    Returns:
        dict with keys ``'user'`` (first text node under the ``user`` element),
        ``'date'`` (the ``meta`` text run through ``parse_date``) and ``'text'``
        (``p.text``).

    Raises:
        IndexError: if either the ``user`` or ``meta`` text node is missing.
    """
    sender = message.xpath(".//*[@class='user']/text()")[0]
    timestamp_text = message.xpath(".//*[@class='meta']/text()")[0]
    record = {}
    record['user'] = sender
    record['date'] = parse_date(timestamp_text)
    record['text'] = p.text
    return record
    
def parse_date(raw_text):
    """Parse a timestamp such as ``"Monday, February 1, 2016 at 3:45pm PST"``.

    The trailing timezone label is discarded, so the result is a naive
    ``datetime.datetime``.

    Args:
        raw_text: Timestamp text ending in a space followed by a timezone label.

    Returns:
        datetime.datetime parsed from the part before the timezone label.

    Raises:
        ValueError: if the remaining text does not match the expected format.
    """
    # dateparser.parse(raw_text, settings={'TIMEZONE': 'US/Pacific'}) is
    # SUPER DUPER SLOW - do not use it here.
    # TODO - timezone support
    # Drop the last whitespace-separated token instead of a fixed four
    # characters, so labels of any length (e.g. "UTC+08") are handled too.
    stamp = raw_text.strip().rpartition(' ')[0]
    return datetime.datetime.strptime(stamp, "%A, %B %d, %Y at %I:%M%p")

def convert_to_dataframe(parsed_thread):
    """Turn one parsed thread into a DataFrame with one row per message.

    Args:
        parsed_thread: dict with a ``'messages'`` list of per-message dicts and
            a ``'title'`` value.

    Returns:
        pandas.DataFrame built from the message dicts, plus a ``'thread'``
        column holding the thread title on every row.
    """
    records = parsed_thread['messages']
    thread_title = parsed_thread['title']
    frame = pandas.DataFrame(records)
    return frame.assign(thread=thread_title)

def pretty_print(node, max_chars=10000):
    """Print the pretty-printed HTML serialization of ``node`` (debug helper).

    Args:
        node: lxml element or tree accepted by ``etree.tostring``.
        max_chars: Length at which the serialized output is cut off before
            printing; defaults to 10000.
    """
    serialized = etree.tostring(node, pretty_print=True, method='html')
    # A parenthesized single-argument print is valid on Python 2 and Python 3.
    print(serialized[:max_chars])

In [3]:
# Read & parse the HTML export, then collect the thread elements selected by
# get_threads() from the parsed tree.
tree = parse_html()
threads = get_threads(tree)

In [4]:
# Parse every thread into a {'title', 'messages'} dict; tqdm shows progress.
parsed_messages = [get_messages(thread) for thread in tqdm.tqdm(threads)]




In [5]:
# Build one DataFrame per thread and concatenate them in a single call.
# DataFrame.append inside a loop re-copies the accumulated frame on every
# iteration and is no longer available in recent pandas releases.
thread_frames = [
    convert_to_dataframe(parsed_message)
    for parsed_message in tqdm.tqdm(parsed_messages)
]
# pandas.concat rejects an empty list; keep df as None when there are no threads.
df = pandas.concat(thread_frames) if thread_frames else None




In [6]:
# Extra column: number of characters in each message body (0 when the body is None).
df['textlen'] = df['text'].apply(
    lambda body: 0 if body is None else len(body)
)

In [7]:
# Number of highest-volume threads to keep.
TOP = 20

# Normalize each timestamp to "year + month": midnight on the first of its month.
df['yearmonth'] = df['date'].apply(
    lambda stamp: datetime.datetime(stamp.year, stamp.month, 1, 0, 0)
)

# Total characters per thread, largest first, limited to the TOP threads and
# exposed as a one-column frame named 'count'.
chars_per_thread = (
    df[['thread', 'textlen']]
    .groupby('thread')['textlen']
    .sum()
    .sort_values(ascending=False)
)
top_counts = pandas.DataFrame(chars_per_thread).rename(columns={'textlen': 'count'})[:TOP]

In [8]:
# Everything other than the threads in top_counts is categorized as "etc".
highest = pandas.DataFrame({
    'thread': top_counts.index,
    'count': top_counts['count'],
    'thread_grouped': top_counts.index,
})
# Join on the 'thread' column only. highest inherits its index from top_counts,
# so the index is dropped for the merge to keep the join key from being
# confused with an index level.
only_highest = pandas.merge(df, highest.reset_index(drop=True), on='thread', how='outer')
# Rows whose thread is not in highest have no group after the merge. Assign the
# filled column back, because fillna(inplace=True) on a selected column may only
# modify a temporary copy.
only_highest['thread_grouped'] = only_highest['thread_grouped'].fillna(value='etc')

In [24]:
# Histogram of chat volume: characters per month, grouped by top threads + "etc".
# The grouping keys are passed as a list because current pandas interprets a
# tuple as one single key.
counts = only_highest.groupby(['yearmonth', 'thread_grouped'])[['textlen']].sum()
counts = counts.rename(columns={'textlen': 'count'})
# Wide layout: one row per month, one column per thread group. asfreq('MS')
# inserts the month-start rows that had no messages, and fillna(0) zeroes both
# those rows and the gaps left by unstack(). Resampler.fillna does not accept a
# fill value, so resample('MS').fillna(0) is not usable here.
monthly = counts.unstack().asfreq('MS').fillna(0)
# The later reset_index()/pivot step looks the month column up by this name.
monthly.index.name = 'yearmonth'
counts = monthly.stack()

In [25]:
# Stacked area chart of monthly character volume, one band per thread group.
column_order = highest.index.tolist() + ['etc']
pivot = (
    counts.reset_index()
    .pivot(index='yearmonth', columns='thread_grouped', values='count')
    .fillna(value=0)
)
# Order the columns as in highest, with the "etc" bucket last.
pivot = pivot[column_order]
plot = pivot.plot(
    kind='area',
    figsize=(30, 5),
    colormap='Paired',
    legend=False,
    title='Facebook Chat Volume',
)
plot.set_ylabel("char / month")
plot.set_xlabel("time")


Out[25]:
<matplotlib.text.Text at 0x125cb4850>

In [26]:
# Messages per month, split by whether the sender is 'Jeeyoung Kim' (is_me).
sent_vs_received = pandas.DataFrame({'is_me': (df['user'] == 'Jeeyoung Kim'), 'date': df['date']})
# Collapse each timestamp to midnight on the first day of its month.
sent_vs_received['yearmonth'] = sent_vs_received['date'].apply(
    lambda dt: datetime.datetime(dt.year, dt.month, 1, 0, 0)
)
# The grouping keys are passed as a list because current pandas interprets a
# tuple as one single key.
sent_vs_received_aggregated = sent_vs_received.groupby(['yearmonth', 'is_me']).count()
sent_vs_received_aggregated = sent_vs_received_aggregated.rename(columns={'date': 'count'})
# asfreq('MS') inserts the month-start rows without any message and fillna(0)
# zeroes them. Resampler.fillna does not accept a fill value, so
# resample('MS').fillna(0) is not usable here.
monthly_by_sender = sent_vs_received_aggregated.unstack().asfreq('MS').fillna(0)
# The later reset_index()/pivot step looks the month column up by this name.
monthly_by_sender.index.name = 'yearmonth'
sent_vs_received_aggregated = monthly_by_sender.stack()

In [27]:
# Line chart of messages per month, one line per is_me value.
pivot = sent_vs_received_aggregated.reset_index().pivot(index='yearmonth',  columns='is_me', values='count')
pivot = pivot.fillna(value=0)
# Plot the frame built above; the name `pivot2` is not defined anywhere in this notebook.
plot = pivot.plot(figsize=(30, 5), kind='line', colormap='Paired', legend=True, title='Sent vs Received')
plot.set_ylabel("message / month")
plot.set_xlabel("time")


Out[27]:
<matplotlib.text.Text at 0x11f4c3250>